## This file calculates the average length of rules, number of rules, and out of
## sample accuracy for BRS and QCA solutions

# Load data and results
load(file="sim/out/simData.rda")
load(file="sim/out/simIndices.rda")
load(file="sim/out/sim_out_QCA.rda")  # QCA rule sets
load(file="sim/out/sim_out_combined.rda")  # BRS rule sets

## Number of rules and length of rules ------------------
# BRS
avgNumRules_BRS <- list()
avgLenRules_BRS <- list()
for(i in 1:length(allRuleSets_BRS_pois)){ # J & DGP
  avgNumRules <- c()
  avgLenRules <- c()
  for(ruleSets in allRuleSets_BRS_pois[[i]]){ # N
    numRules <- c()
    lenRules <- c()
    for(ruleSet in ruleSets){ # iteration
      numRules <- c(numRules, length(ruleSet))
      for(rule in ruleSet){ # For each rule in the rule sett
        lenRules <- c(lenRules, length(rule)) 
      }
    }
    # Avg for J & DGP & N
    avgNumRules <- c(avgNumRules, mean(numRules))
    avgLenRules <- c(avgLenRules, mean(lenRules))
  }
  avgNumRules_BRS[[i]] <- avgNumRules
  avgLenRules_BRS[[i]] <- avgLenRules
}

# QCA
avgNumRules_QCA <- list()
avgLenRules_QCA <- list()
for(i in 1:length(allRuleSets_QCA)){ # J & DGP
  avgNumRules <- c()
  avgLenRules <- c()
  for(j in 1:length(allRuleSets_QCA[[i]])){ # N
    ruleSets <- lapply(allRuleSets_QCA[[i]][[j]], function(x) strsplit(x, split="[*]"))
    numRules <- c()
    lenRules <- c()
    for(ruleSet in ruleSets){ # iteration
      numRules <- c(numRules, length(ruleSet))
      for(rule in ruleSet){ # For each rule in the rule sett
        lenRules <- c(lenRules, length(rule)) 
      }
    }
    # Avg for J & DGP & N
    avgNumRules[j] <- mean(numRules)
    avgLenRules[j] <- mean(lenRules)
  }
  avgNumRules_QCA[[i]] <- avgNumRules
  avgLenRules_QCA[[i]] <- avgLenRules
}



## Out of sample accuracy --------------------------
allAvgPerformance_BRS <- list()
for(i in 1:length(allRuleSets_BRS_pois)){ # J & DGP
  avgPerformance_BRS <- c() # Rows = different N
  for(j in 1:length(allRuleSets_BRS_pois[[i]])) { # same J &DGP, different N
    ruleSets <- allRuleSets_BRS_pois[[i]][[j]]
    performance_BRS <- foreach (k=1:length(ruleSets)) %dopar% {
      ruleSet <- ruleSets[[k]]
      data <- allData[[i]][[k]]
      df <- data[-allIndices[[j]][[k]],-ncol(data)]
      Y <- data[-allIndices[[j]][[k]], ncol(data)] # Last column is Y
      # Reformat stored rule sets BRS
      sum(getYhat(df, ruleSet) == Y)/nrow(df)
    }
    avgPerformance_BRS <- c(avgPerformance_BRS, mean(unlist(performance_BRS)))
  }
  allAvgPerformance_BRS[[i]] <- avgPerformance_BRS
}
allAvgPerformance_QCA <- list()
for(i in 1:length(allRuleSets_QCA)){ # J & DGP
  avgPerformance_QCA <- c() # Rows = different N
  for(j in 1:length(allRuleSets_QCA[[i]])){ # N
    ruleSets <- allRuleSets_QCA[[i]][[j]]
    performance_QCA <- foreach (k=1:length(ruleSets)) %dopar% {
      A_QCA <- ruleSets[[k]]
      data <- allData[[i]][[k]]
      df <- data[-allIndices[[j]][[k]],-ncol(data)]
      Y <- data[-allIndices[[j]][[k]], ncol(data)] # Last column is Y
      ruleSet <- list()
      for(l in 1:length(A_QCA)){
        ruleSet[[l]] <- reformat(A_QCA[[l]])
      }
      sum(getYhat(df, ruleSet) == Y)/nrow(df)
    }
    avgPerformance_QCA <- c(avgPerformance_QCA, mean(unlist(performance_QCA)))
  }
  allAvgPerformance_QCA[[i]] <- avgPerformance_QCA
}



## Sort deterministic vs probabilistic and format data for making separate graphs -------------------------
# you can check what DGP is used for each by looking at the names of allData,
# which have the format J_pp_pn, where J=number of variables, pp=P(y=1 | x \in A), and pn=P(y=1 | x \notin A)
# (see the generate data section of sim_make-data.R)
names(allData)  # odd indices are deterministic (pp, pn)=(1, 0) and even are probabilistic (pp, pn)=(.75, .25)

avgNumRules_QCA_d <- c(); avgNumRules_QCA_p <- c()
avgLenRules_QCA_d <- c(); avgLenRules_QCA_p <- c()
avgNumRules_BRS_d <- c(); avgNumRules_BRS_p <- c()
avgLenRules_BRS_d <- c(); avgLenRules_BRS_p <- c()
avgAcc_BRS_p <- c(); avgAcc_QCA_p <- c(); 
avgAcc_BRS_d <- c(); avgAcc_QCA_d <- c();
# BRS
for(i in 1:length(allData)){
  if(i %% 2){ # If odd, then deterministic
    avgNumRules_BRS_d <- c(avgNumRules_BRS_d, avgNumRules_BRS[[i]])
    avgLenRules_BRS_d <- c(avgLenRules_BRS_d, avgLenRules_BRS[[i]])
    avgAcc_BRS_d <- c(avgAcc_BRS_d, allAvgPerformance_BRS[[i]])
  } else { # If even, probabilistic
    avgNumRules_BRS_p <- c(avgNumRules_BRS_p, avgNumRules_BRS[[i]])
    avgLenRules_BRS_p <- c(avgLenRules_BRS_p, avgLenRules_BRS[[i]])
    avgAcc_BRS_p <- c(avgAcc_BRS_p, allAvgPerformance_BRS[[i]])
  }
}
# QCA
for(i in 1:length(allData)){
  if(i %% 2){ # If odd, then deterministic
    avgNumRules_QCA_d <- c(avgNumRules_QCA_d, avgNumRules_QCA[[i]])
    avgLenRules_QCA_d <- c(avgLenRules_QCA_d, avgLenRules_QCA[[i]])
    avgAcc_QCA_d <- c(avgAcc_QCA_d, allAvgPerformance_QCA[[i]])
  } else { # If even, probabilistic
    avgNumRules_QCA_p <- c(avgNumRules_QCA_p, avgNumRules_QCA[[i]])
    avgLenRules_QCA_p <- c(avgLenRules_QCA_p, avgLenRules_QCA[[i]])
    avgAcc_QCA_p <- c(avgAcc_QCA_p, allAvgPerformance_QCA[[i]])
  }
}


## format as data frames and save --------------------
J <- c(5, 10, 20) # Number of variables
sampleSize <- c(25, 100, 250, 500, 750, 1000)
perf <- cbind(c(rep("BRS", length(J)*length(sampleSize)), rep("QCA", length(J)*length(sampleSize))), 
              rep(c(rep("5", length(sampleSize)), rep("10", length(sampleSize)), rep("20", length(sampleSize))), 2),
              rep(sampleSize, 3))

perf_p <- data.frame(perf,
                     c(avgNumRules_BRS_p, avgNumRules_QCA_p, NA, NA, NA, NA),  # NAs because QCA doesn't have solutions for some settigns
                     c(avgLenRules_BRS_p, avgLenRules_QCA_p, NA, NA, NA, NA),
                     c(avgAcc_BRS_p, avgAcc_QCA_p,  NA, NA, NA, NA),
                     stringsAsFactors = F)
colnames(perf_p) <- c("method", "J", "n", "nrules", "lenrules", "acc")
perf_p$n <- as.numeric(perf_p$n)
perf_p$J <- factor(perf_p$J, levels=c("5", "10", "20"))

perf_d <- data.frame(perf,
                     c(avgNumRules_BRS_d, avgNumRules_QCA_d),
                     c(avgLenRules_BRS_d, avgLenRules_QCA_d),
                     c(avgAcc_BRS_d, avgAcc_QCA_d),
                     stringsAsFactors = F)
colnames(perf_d) <- c("method", "J", "n", "nrules", "lenrules", "acc")
perf_d$n <- as.numeric(perf_d$n)
perf_d$J <- factor(perf_d$J, levels=c("5", "10", "20"))

performance_df <- list(perf_p, perf_d)

save(performance_df, file="sim/out/performance.rda")
